Kernel Author:
Bhishan Poudel, Ph.D Contd. Astrophysics

Date    : Jan 31, 2020
Topic    : ngals5k z1.5

Introduction

Update

  1. Looked at gm0 vs gc0 (and gm1 vs gc1) 45 degree line and removed outliers.
  2. Find the weights for g_sq for given magnitude bins using smooth fitting curve.

Usual Filtering

df = df.query('calib_psfCandidate == 0.0')
df = df.query('deblend_nChild == 0.0')
df['ellip'] = np.hypot( df['ext_shapeHSM_HsmShapeRegauss_e1'] ,
                        df['ext_shapeHSM_HsmShapeRegauss_e2'] )
df = df.query('ellip < 2.0') # it was 1.5 before

#select only few columns after filtering:
cols_select = ['base_SdssCentroid_x', 'base_SdssCentroid_y',
                'base_SdssCentroid_xSigma','base_SdssCentroid_ySigma',
                'ext_shapeHSM_HsmShapeRegauss_e1','ext_shapeHSM_HsmShapeRegauss_e2',
                'base_SdssShape_flux']
df = df[cols_select]        

# drop all nans
df = df.dropna()

# additional columns
df['radius'] =  df.eval(""" ( (ext_shapeHSM_HsmSourceMoments_xx *  ext_shapeHSM_HsmSourceMoments_yy) \
                                          -  (ext_shapeHSM_HsmSourceMoments_xy**2 ) )**0.25 """)

Shape filtering
https://github.com/LSSTDESC/DC2-analysis/blob/master/tutorials/object_gcr_2_lensing_cuts.ipynb

df = df.query('ext_shapeHSM_HsmShapeRegauss_resolution >= 0.3')
df = df.query('ext_shapeHSM_HsmShapeRegauss_sigma <= 0.4')
df = df.query('ext_shapeHSM_HsmShapeRegauss_flag== 0.0')

Filter strongly lensed objects

  • Take the objects with centroids >154 pixels (remove strong lens objects).
    # exclude strong lens objects <=154 distance
    # The shape of lsst.fits file is 3998,3998 and center is 1699,1699.
    df['x_center'] = 1699
    df['y_center'] = 1699
    df['distance'] = ( (df['x[0]'] - df['x_center'])**2 + (df['x[1]'] - df['y_center'])**2 )**0.5
    df = df[df.distance > 154]
    

Imcat script

# create new columns and cleaning (four files)
lc -C -n fN -n id -N '1 2 x' -N '1 2 errx' -N '1 2 g' -n ellip -n flux -n radius < "${M9T}".txt  |  lc +all 'mag = %flux log10 -2.5 *'  |  cleancat 15  |  lc +all -r 'mag' > "${M9C}".cat


# merge 4 catalogs
mergecats 5 "${MC}".cat "${M9C}".cat "${LC}".cat "${L9C}".cat > ${catalogs}/merge.cat &&


lc -b +all 
'x = %x[0][0] %x[1][0] + %x[2][0] + %x[3][0] + 4 / %x[0][1] %x[1][1] + %x[2][1] + %x[3][1] + 4 / 2 vector'
'gm = %g[0][0] %g[1][0] + 2 / %g[0][1] %g[1][1] + 2 / 2 vector' 
'gc = %g[2][0] %g[3][0] + 2 / %g[2][1] %g[3][1] + 2 / 2 vector'   
'gmd = %g[0][0] %g[1][0] - 2 / %g[0][1] %g[1][1] - 2 / 2 vector' 
'gcd = %g[2][0] %g[3][0] - 2 / %g[2][1] %g[3][1] - 2 / 2 vector' 
< ${catalogs}/merge.cat > ${final}/final_${i}.cat

Notes

final_text.txt is created by imcat program after merging four lsst files (m,m9,l,l9) after cleaning.

Imports

In [30]:
import json, os,sys
import numpy as np
import pandas as pd
import seaborn as sns
import time
sns.set(color_codes=True)

import plotly
import ipywidgets

pd.set_option('display.max_columns',200)
time_start_notebook = time.time()

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

print([(x.__name__, x.__version__) for x in [np,pd,sns,plotly,ipywidgets]])
[('numpy', '1.18.4'), ('pandas', '1.0.3'), ('seaborn', '0.10.1'), ('plotly', '4.8.0'), ('ipywidgets', '7.5.1')]
In [31]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

Load the final text cleancat15 data

g_sq = g00 g00 + g10 g10
gmd_sq = gmd0**2 + gmd1**2
In [32]:
!head -2 ../data/cleancat/final_text_cleancat15_ngals5k_z0.5_125_280.txt
#       fN[0][0]       fN[1][0]       fN[2][0]       fN[3][0]       id[0][0]       id[1][0]       id[2][0]       id[3][0]           x[0]           x[1]     errx[0][0]     errx[0][1]     errx[1][0]     errx[1][1]     errx[2][0]     errx[2][1]     errx[3][0]     errx[3][1]        g[0][0]        g[0][1]        g[1][0]        g[1][1]        g[2][0]        g[2][1]        g[3][0]        g[3][1]    ellip[0][0]    ellip[1][0]    ellip[2][0]    ellip[3][0]     flux[0][0]     flux[1][0]     flux[2][0]     flux[3][0]   radius[0][0]   radius[1][0]   radius[2][0]   radius[3][0]      mag[0][0]      mag[1][0]      mag[2][0]      mag[3][0]          gm[0]          gm[1]          gc[0]          gc[1]         gmd[0]         gmd[1]         gcd[0]         gcd[1]
             125            125            125            125            673            669            679            665       2812.747      1504.8803         0.0243         0.0167          0.016         0.0271         0.0243         0.0167         0.0161         0.0271         1.0389           0.33        -1.0468        -0.2552         0.9883         0.3218        -1.0469        -0.2572      1.0900519      1.0774587       1.039371      1.0780313      37423.424      37664.673      37345.853      37586.541      4.1578392      4.2172688      4.1575487      4.2169835     -11.432859     -11.439836     -11.430606     -11.437581       -0.00395         0.0374        -0.0293         0.0323        1.04285         0.2926         1.0176         0.2895
In [33]:
names = "fN[0][0]       fN[1][0]       fN[2][0]       fN[3][0]       id[0][0]       id[1][0]       id[2][0]       id[3][0]           x[0]           x[1]     errx[0][0]     errx[0][1]     errx[1][0]     errx[1][1]     errx[2][0]     errx[2][1]     errx[3][0]     errx[3][1]        g[0][0]        g[0][1]        g[1][0]        g[1][1]        g[2][0]        g[2][1]        g[3][0]        g[3][1]    ellip[0][0]    ellip[1][0]    ellip[2][0]    ellip[3][0]     flux[0][0]     flux[1][0]     flux[2][0]     flux[3][0]   radius[0][0]   radius[1][0]   radius[2][0]   radius[3][0]      mag[0][0]      mag[1][0]      mag[2][0]      mag[3][0]          gm[0]          gm[1]          gc[0]          gc[1]         gmd[0]         gmd[1]         gcd[0]         gcd[1]"
print(names)
fN[0][0]       fN[1][0]       fN[2][0]       fN[3][0]       id[0][0]       id[1][0]       id[2][0]       id[3][0]           x[0]           x[1]     errx[0][0]     errx[0][1]     errx[1][0]     errx[1][1]     errx[2][0]     errx[2][1]     errx[3][0]     errx[3][1]        g[0][0]        g[0][1]        g[1][0]        g[1][1]        g[2][0]        g[2][1]        g[3][0]        g[3][1]    ellip[0][0]    ellip[1][0]    ellip[2][0]    ellip[3][0]     flux[0][0]     flux[1][0]     flux[2][0]     flux[3][0]   radius[0][0]   radius[1][0]   radius[2][0]   radius[3][0]      mag[0][0]      mag[1][0]      mag[2][0]      mag[3][0]          gm[0]          gm[1]          gc[0]          gc[1]         gmd[0]         gmd[1]         gcd[0]         gcd[1]
In [34]:
names = ['fN[0][0]','fN[1][0]','fN[2][0]','fN[3][0]',
 'id[0][0]','id[1][0]','id[2][0]','id[3][0]',
 'x[0]','x[1]',
 'errx[0][0]','errx[0][1]','errx[1][0]','errx[1][1]','errx[2][0]',
 'errx[2][1]','errx[3][0]','errx[3][1]',
 'g[0][0]','g[0][1]','g[1][0]','g[1][1]','g[2][0]','g[2][1]','g[3][0]','g[3][1]',
 'ellip[0][0]','ellip[1][0]','ellip[2][0]','ellip[3][0]',
 'flux[0][0]','flux[1][0]','flux[2][0]','flux[3][0]',
 'radius[0][0]','radius[1][0]','radius[2][0]','radius[3][0]',
 'mag[0][0]','mag[1][0]','mag[2][0]','mag[3][0]',
 'gm[0]','gm[1]','gc[0]', 'gc[1]',
 'gmd[0]','gmd[1]','gcd[0]','gcd[1]']


file_path = '../data/cleancat/final_text_cleancat15_ngals5k_z0.5_125_280.txt'


df = pd.read_csv(file_path,comment='#',engine='python',sep=r'\s\s+',
                 header=None,names=names)

print(df.shape)

# new columns
# df['g_sq'] = df['g[0][0]'] **2 + df['g[1][0]']**2 # only for imcat 00 and 10
# df['gmd_sq'] = df['gmd[0]'] **2 + df['gmd[1]']**2

df['g_sq'] = df['g[0][0]'] **2 + df['g[0][1]']**2
df['gmd_sq'] = df['gmd[0]'] **2 + df['gmd[1]']**2

df['gm_sq'] = df['gm[0]']**2 + df['gm[1]']**2
df['gc_sq'] = df['gc[0]']**2 + df['gc[1]']**2

df['mag_mono'] = (df['mag[0][0]'] + df['mag[1][0]'] ) / 2
df['mag_chro'] = (df['mag[2][0]'] + df['mag[3][0]'] ) / 2

df.head()
(35461, 50)
Out[34]:
fN[0][0] fN[1][0] fN[2][0] fN[3][0] id[0][0] id[1][0] id[2][0] id[3][0] x[0] x[1] errx[0][0] errx[0][1] errx[1][0] errx[1][1] errx[2][0] errx[2][1] errx[3][0] errx[3][1] g[0][0] g[0][1] g[1][0] g[1][1] g[2][0] g[2][1] g[3][0] g[3][1] ellip[0][0] ellip[1][0] ellip[2][0] ellip[3][0] flux[0][0] flux[1][0] flux[2][0] flux[3][0] radius[0][0] radius[1][0] radius[2][0] radius[3][0] mag[0][0] mag[1][0] mag[2][0] mag[3][0] gm[0] gm[1] gc[0] gc[1] gmd[0] gmd[1] gcd[0] gcd[1] g_sq gmd_sq gm_sq gc_sq mag_mono mag_chro
0 125 125 125 125 673 669 679 665 2812.74700 1504.88030 0.0243 0.0167 0.0160 0.0271 0.0243 0.0167 0.0161 0.0271 1.0389 0.3300 -1.0468 -0.2552 0.9883 0.3218 -1.0469 -0.2572 1.090052 1.077459 1.039371 1.078031 37423.4240 37664.6730 37345.8530 37586.5410 4.157839 4.217269 4.157549 4.216983 -11.432859 -11.439836 -11.430606 -11.437581 -0.00395 0.03740 -0.02930 0.03230 1.04285 0.29260 1.01760 0.28950 1.188213 1.173151 0.001414 0.001902 -11.436348 -11.434093
1 125 125 125 125 3624 3617 3627 3615 1476.06470 2177.74190 0.0575 0.0381 0.0362 0.0575 0.0573 0.0381 0.0362 0.0580 1.0092 0.2622 -1.0400 -0.1400 0.9842 0.2581 -1.0684 -0.1422 1.042705 1.049381 1.017480 1.077822 16508.8310 16472.9350 16469.9190 16437.5730 4.225692 4.127967 4.224107 4.127593 -10.544291 -10.541927 -10.541729 -10.539594 -0.01540 0.06110 -0.04210 0.05795 1.02460 0.20110 1.02630 0.20015 1.087233 1.090246 0.003970 0.005131 -10.543109 -10.540662
2 125 125 125 125 4006 3988 3998 3986 330.59460 2532.93750 0.0737 0.0814 0.0701 0.0828 0.0731 0.0812 0.0699 0.0830 -0.1782 0.8218 -0.3298 0.9130 -0.1521 0.8391 -0.3764 0.9629 0.840899 0.970740 0.852774 1.033854 6819.5376 6803.3047 6798.0851 6809.5060 4.200198 4.138292 4.185054 4.139294 -9.584387 -9.581800 -9.580966 -9.582789 -0.25400 0.86740 -0.26425 0.90100 0.07580 -0.04560 0.11215 -0.06190 0.707110 0.007825 0.816899 0.881629 -9.583094 -9.581878
3 125 125 125 125 3765 3756 3760 3749 2062.74510 2337.12310 0.0578 0.0389 0.0566 0.0406 0.0576 0.0388 0.0567 0.0406 0.7578 -0.7258 0.6626 -0.8152 0.7534 -0.7081 0.6832 -0.8223 1.049308 1.050519 1.033933 1.069083 22224.8360 21847.7210 22168.7250 21830.3450 4.870533 4.787105 4.854372 4.789153 -10.867096 -10.848515 -10.864352 -10.847651 0.71020 -0.77050 0.71830 -0.76520 0.04760 0.04470 0.03510 0.05710 1.101046 0.004264 1.098054 1.101486 -10.857805 -10.856001
4 125 125 125 125 2221 2338 2229 2337 835.16188 811.38905 0.2591 0.2448 0.2588 0.2426 0.2692 0.2516 0.2718 0.2589 0.3038 -0.8074 0.2382 -0.8645 0.3133 -0.6102 0.1663 -0.8017 0.862664 0.896716 0.685931 0.818766 6034.2829 6037.6519 6177.2667 6151.1167 4.543778 4.522524 4.822348 4.762993 -9.451564 -9.452170 -9.476991 -9.472385 0.27100 -0.83595 0.23980 -0.70595 0.03280 0.02855 0.07350 0.09575 0.744189 0.001891 0.772253 0.555869 -9.451867 -9.474688

Plot g-squared for monochromatic and chromatic files

In [35]:
if not os.path.isdir('images'):
    os.makedirs('images')
In [36]:
fig,ax = plt.subplots(1,2,figsize=(14,6))

df['gm_sq'].plot.hist(bins=50,ax=ax[0],color='b',label='mono')
ax[0].set_xlabel('gm_sq')
ax[0].legend()
ax[0].set_title('g-squared histogram for mono')

# gcsq
df['gc_sq'].plot.hist(bins=50,ax=ax[1],color='r',label='chro')
ax[1].set_xlabel('gc_sq')
ax[1].legend()
ax[1].set_title('g-squared histogram for chro')

# savefig
plt.savefig('images/gmsq_gcsq_hist.png')
In [37]:
plt.figure(figsize=(12,8))
sns.distplot(df['g_sq'],label='g_sq')
sns.distplot(df['gmd_sq'],label='gmd_sq')

plt.legend()
Out[37]:
<matplotlib.legend.Legend at 0x7fe5e6982ad0>

Contour Plots for Diagonal cuts

In [38]:
import plotly
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
In [39]:
def matrix_of_number_density_from_two_cols(df,xcol,ycol,N):
    """Create grid of number density of two columns
    
    - Find the absolute max from two columns.
    - Create N bins -absMax to +absMax.
    - Return a matrix of N*N shape.
    """
    from itertools import product
    
    # derived variables
    xlabel = xcol
    ylabel = ycol
    xlabel1 = xlabel + '_cat_labels'
    ylabel1 = ylabel + '_cat_labels'
    
    xlabel2 = xlabel + '_cat'
    ylabel2 = ylabel + '_cat'
    colname = 'cat_freq'
    
    # take only xlabel and ylabel columns
    dx = df[[xlabel, ylabel]].copy()
    
    # get absolute maximum from two columns
    tolerance = 0.0000001
    max_abs_xcol_ycol = df[[xcol,ycol]].describe().iloc[[3,-1],:].abs().max().max()
    
    # create 1d array with N+1 values to create N bins
#     bins = np.linspace(-max_abs_xcol_ycol-tolerance, max_abs_xcol_ycol+tolerance,N+1)
    bins = np.linspace(0, max_abs_xcol_ycol,N+1)

    # create N bins
    dx[xlabel1] = pd.cut(dx[xlabel], bins, labels=np.arange(N))
    dx[ylabel1] = pd.cut(dx[ylabel], bins, labels=np.arange(N))

    # count number of points in each bin
    dx[colname] = dx.groupby([xlabel1,ylabel1])[xlabel1].transform('count')

    # drop duplicates
    dx1 = dx.drop_duplicates(subset=[xlabel1,ylabel1])[[xlabel1,ylabel1,colname]]

    # use permutation to get the grid of N * N
    perms = list(product(range(N), range(N)))
    x = [i[0] for i in perms]
    y = [i[1] for i in perms]
    dx2 = pd.DataFrame({xlabel1: x, ylabel1: y, colname:0})

    # update dx2 to merge frequency values
    dx2.update(dx2.drop(colname,1).merge(dx1,how='left'))
    dx2 = dx2.astype(int)

    # z values to plot heatmap
    z = dx2[colname].values.reshape(N,N)
    z = z.T

    return z

Transform and scale the data

In [40]:
def transform_scale(z,transform='linear',scale=None):
    """Transform and scale given 1d numpy array.
    
    transform: linear, log, sqrt, sinh, arcsinh
    scale    : minmax, zscale
    
    """
    #==================================
    # linear, log, sqrt, arcsinh, sinh 
    #
    # we need linear tranform option to compare.
    if transform == 'linear':
        z = z

    if transform == 'log':
        z = np.log1p(z)
        
    if transform == 'sqrt':
        z = np.sqrt(z)

    if transform == 'sinh':
        z = np.sinh(z)
        
    if transform == 'arcsinh':
        z = np.arcsinh(z)
    
    #===============================
    # scaling minmax and zscale
    if scale== 'minmax':
        z = z / (z.max()-z.min())
    if scale == 'zscale':
        z = (z-z.mean()) / z.std()
    if scale is None:
        z = z
        
    return z

plot the countours

In [41]:
def plot_contour(Z,colorscale,x1y1x2y2=None,
                 title='Contour plot',
                 xlabel='',
                 ylabel=''):
    """Plot the contour plot.

    Contour plot from given 2d array.
    
    Example:
    -----------
    z  = matrix_of_number_density_from_two_cols(df,'gsq','gmdsq',N)
    z1 = transform_scale(z,transform=transform,scale=scale)

    plot_contour(z1, colorscale='Viridis', x1y1x2y2=[10,0,99,90],
            title=f'Contour plot: {transform}+{scale}',
            xlabel='Bin number of gsq',
            ylabel='Bin number of gmsq')
    
    """

    trace1 = go.Contour(z=Z, colorscale=colorscale)
    axis_layout = dict(
        showticklabels = True
    )
    
    xaxis = {**axis_layout, **{'title':f'{xlabel}'}}
    yaxis = {**axis_layout, **{'title':f'{ylabel}'}}

    layout = go.Layout(
        width=1000,
        height=1000,
        autosize=False,
        title=f'{title} ',
        xaxis = xaxis,
        yaxis = yaxis,
    )
    

    data = [trace1]
    
    if x1y1x2y2:

        # center line 
        p2x1, p2y1 = 0,0
        p2x2, p2y2 = 99,99
        sc1 = go.Scatter(x=[p2x1,p2x2],
                         y=[p2y1,p2y2],
                         mode = 'lines+markers',
                         name = f'line ({p2x1},{p2y1}) to ({p2x2},{p2y2})')
        
        sc2 = go.Scatter(x=[x1y1x2y2[0],x1y1x2y2[2]],
                         y=[x1y1x2y2[1],x1y1x2y2[3]],
                         mode = 'lines+markers',
                         name = f'line ({x1y1x2y2[0]},{x1y1x2y2[1]}) \
                         to ({x1y1x2y2[2]},{x1y1x2y2[3]})')


        data = [trace1, sc1, sc2]

    fig = go.Figure( data=data, layout=layout )
    
    # update figure
    fig.update_layout(
    xaxis = dict(tickmode = 'linear',dtick = 5),
    yaxis = dict(tickmode = 'linear',dtick = 5),
    )

    iplot(fig)
    return None

N = 100
transform='log' # linear log sqrt sinh
scale='minmax'


xcol,ycol = 'g_sq', 'gmd_sq'
z  = matrix_of_number_density_from_two_cols(df,xcol,ycol,N)
z1 = transform_scale(z,transform=transform,scale=scale)

plot_contour(z1, colorscale='Viridis', x1y1x2y2=[10,0,99,90],
            title=f'Contour plot for transform =  {transform} and scaling = {scale}',
            xlabel=f'Bin number of {xcol}',
            ylabel=f'Bin number of {ycol}')

Grid of N*N from g_sq and gmd_sq

In [42]:
N = 100
xcol = 'g_sq'
ycol = 'gmd_sq'
max_abs_xcol_ycol = df[[xcol,ycol]].abs().max().max()

print(max_abs_xcol_ycol)

df[df[xcol]==max_abs_xcol_ycol]
3.91756097
Out[42]:
fN[0][0] fN[1][0] fN[2][0] fN[3][0] id[0][0] id[1][0] id[2][0] id[3][0] x[0] x[1] errx[0][0] errx[0][1] errx[1][0] errx[1][1] errx[2][0] errx[2][1] errx[3][0] errx[3][1] g[0][0] g[0][1] g[1][0] g[1][1] g[2][0] g[2][1] g[3][0] g[3][1] ellip[0][0] ellip[1][0] ellip[2][0] ellip[3][0] flux[0][0] flux[1][0] flux[2][0] flux[3][0] radius[0][0] radius[1][0] radius[2][0] radius[3][0] mag[0][0] mag[1][0] mag[2][0] mag[3][0] gm[0] gm[1] gc[0] gc[1] gmd[0] gmd[1] gcd[0] gcd[1] g_sq gmd_sq gm_sq gc_sq mag_mono mag_chro
25431 233 233 233 233 3779 3778 3700 3695 1603.9956 2291.9362 0.0266 0.0263 0.025 0.0311 0.0342 0.0275 0.0259 0.0309 -0.3001 1.9564 -0.8341 0.8838 -0.1789 1.2475 -0.6942 0.9649 1.979283 1.215247 1.260263 1.188674 16521.265 17662.844 19831.667 19038.667 4.253503 4.388925 4.988621 4.654104 -10.545108 -10.617652 -10.743398 -10.699091 -0.5671 1.4201 -0.43655 1.1062 0.267 0.5363 0.25765 0.1413 3.917561 0.358907 2.338286 1.414254 -10.58138 -10.721245
In [43]:
# create 1d array with N+1 values to create N bins
bins = np.linspace(0, max_abs_xcol_ycol,N+1)

# bins dict
bins_dict = {i:v for i,v in enumerate(bins)}

# create N bins
ser_ycol_bins = pd.cut(df[ycol], bins=bins,)

df['g_sq_bins'] = ser_ycol_bins
df[['g_sq','g_sq_bins']].head()
Out[43]:
g_sq g_sq_bins
0 1.188213 (1.136, 1.175]
1 1.087233 (1.058, 1.097]
2 0.707110 (0.0, 0.0392]
3 1.101046 (0.0, 0.0392]
4 0.744189 (0.0, 0.0392]
In [44]:
# bin 0  ==> 0          to 0.03994369
# bin 1  ==> 0.03994369 to 0.07988739
# bin 10 ==> 0.39943694 to 0.43938063

bins[10], bins[11]
Out[44]:
(0.39175609699999997, 0.4309317067)

Analysis of Points above gmd_sq bin number 10

What is the corresponding gmsq value to y-axis bin number 10 (11th bin)?

The 100*100 bin is created from absMax of g_sq and gmd_sq. Then the line 0 to absMax is divided into 100 parts and bins are created.

bin 10 for gmd_sq is 0.39943694 to 0.43938063

In [45]:
# take all points where gmd_sq > 10th bin

df_gmd_sq_lt_bin10 = df[df.gmd_sq > bins[10]]

print(df_gmd_sq_lt_bin10.shape)

df_gmd_sq_lt_bin10.head()
(12573, 57)
Out[45]:
fN[0][0] fN[1][0] fN[2][0] fN[3][0] id[0][0] id[1][0] id[2][0] id[3][0] x[0] x[1] errx[0][0] errx[0][1] errx[1][0] errx[1][1] errx[2][0] errx[2][1] errx[3][0] errx[3][1] g[0][0] g[0][1] g[1][0] g[1][1] g[2][0] g[2][1] g[3][0] g[3][1] ellip[0][0] ellip[1][0] ellip[2][0] ellip[3][0] flux[0][0] flux[1][0] flux[2][0] flux[3][0] radius[0][0] radius[1][0] radius[2][0] radius[3][0] mag[0][0] mag[1][0] mag[2][0] mag[3][0] gm[0] gm[1] gc[0] gc[1] gmd[0] gmd[1] gcd[0] gcd[1] g_sq gmd_sq gm_sq gc_sq mag_mono mag_chro g_sq_bins
0 125 125 125 125 673 669 679 665 2812.74700 1504.8803 0.0243 0.0167 0.0160 0.0271 0.0243 0.0167 0.0161 0.0271 1.0389 0.3300 -1.0468 -0.2552 0.9883 0.3218 -1.0469 -0.2572 1.090052 1.077459 1.039371 1.078031 37423.4240 37664.6730 37345.8530 37586.5410 4.157839 4.217269 4.157549 4.216983 -11.432859 -11.439836 -11.430606 -11.437581 -0.00395 0.03740 -0.02930 0.03230 1.04285 0.29260 1.01760 0.28950 1.188213 1.173151 0.001414 0.001902 -11.436348 -11.434093 (1.136, 1.175]
1 125 125 125 125 3624 3617 3627 3615 1476.06470 2177.7419 0.0575 0.0381 0.0362 0.0575 0.0573 0.0381 0.0362 0.0580 1.0092 0.2622 -1.0400 -0.1400 0.9842 0.2581 -1.0684 -0.1422 1.042705 1.049381 1.017480 1.077822 16508.8310 16472.9350 16469.9190 16437.5730 4.225692 4.127967 4.224107 4.127593 -10.544291 -10.541927 -10.541729 -10.539594 -0.01540 0.06110 -0.04210 0.05795 1.02460 0.20110 1.02630 0.20015 1.087233 1.090246 0.003970 0.005131 -10.543109 -10.540662 (1.058, 1.097]
9 125 125 125 125 1107 1107 1120 1110 231.78355 2476.4725 0.1350 0.0985 0.0934 0.1466 0.1341 0.0994 0.0942 0.1476 1.0104 0.2082 -1.0733 -0.0683 1.0468 0.2275 -1.1408 -0.0708 1.031627 1.075471 1.071236 1.142995 6155.8654 6185.8754 6138.1610 6168.6647 4.012843 4.027674 4.010899 4.025892 -9.473223 -9.478503 -9.470096 -9.475478 -0.03145 0.06995 -0.04700 0.07835 1.04185 0.13825 1.09380 0.14915 1.064255 1.104564 0.005882 0.008348 -9.475863 -9.472787 (1.097, 1.136]
11 125 125 125 125 3530 3472 3535 3483 1206.80990 2109.9034 0.0120 0.0079 0.0074 0.0119 0.0120 0.0080 0.0074 0.0120 0.7728 -0.5090 -0.7244 0.6217 0.7657 -0.5007 -0.7310 0.6325 0.925365 0.954603 0.914875 0.966653 113953.3100 113060.1300 113725.0100 112838.1800 5.003308 4.959239 5.003152 4.959238 -12.641817 -12.633274 -12.639640 -12.631140 0.02420 0.05635 0.01735 0.06590 0.74860 -0.56535 0.74835 -0.56660 0.856301 0.880023 0.003761 0.004644 -12.637545 -12.635390 (0.862, 0.901]
14 125 125 125 125 2746 2735 2754 2733 1377.81840 1231.8038 0.0861 0.1384 0.1375 0.0916 0.0858 0.1377 0.1378 0.0921 -1.0946 -0.1038 1.0728 -0.0672 -1.0392 -0.0965 1.1074 -0.0763 1.099511 1.074903 1.043671 1.110025 6867.7531 6904.5410 6863.4534 6891.0744 4.109582 4.190900 4.112447 4.190650 -9.592037 -9.597837 -9.591357 -9.595717 -0.01090 -0.08550 0.03410 -0.08640 -1.08370 -0.01830 -1.07330 -0.01010 1.208924 1.174741 0.007429 0.008628 -9.594937 -9.593537 (1.136, 1.175]
In [46]:
# fN[0][0]  ==> lsst_mono_z1.5_000.fits
# fN[1][0]  ==> lsst_mono90_z1.5_000.fits
#
# id[0][0]  ==> id of given object for mono file number 0
In [47]:
# take only first file number 0 (it has m,m9,c and c9)

df_gmd_sq_lt_bin10_file0 = df_gmd_sq_lt_bin10[df_gmd_sq_lt_bin10['fN[0][0]'] == 0]

# add radius for mono
df_gmd_sq_lt_bin10_file0['radius_mono'] = \
(df_gmd_sq_lt_bin10_file0['radius[0][0]'] + 
 df_gmd_sq_lt_bin10_file0['radius[1][0]'] ) /2.0

print(df_gmd_sq_lt_bin10_file0.shape)

df_gmd_sq_lt_bin10_file0.head()
(0, 58)
Out[47]:
fN[0][0] fN[1][0] fN[2][0] fN[3][0] id[0][0] id[1][0] id[2][0] id[3][0] x[0] x[1] errx[0][0] errx[0][1] errx[1][0] errx[1][1] errx[2][0] errx[2][1] errx[3][0] errx[3][1] g[0][0] g[0][1] g[1][0] g[1][1] g[2][0] g[2][1] g[3][0] g[3][1] ellip[0][0] ellip[1][0] ellip[2][0] ellip[3][0] flux[0][0] flux[1][0] flux[2][0] flux[3][0] radius[0][0] radius[1][0] radius[2][0] radius[3][0] mag[0][0] mag[1][0] mag[2][0] mag[3][0] gm[0] gm[1] gc[0] gc[1] gmd[0] gmd[1] gcd[0] gcd[1] g_sq gmd_sq gm_sq gc_sq mag_mono mag_chro g_sq_bins radius_mono

Regions above and below for gsq vs gmdsq contour plot

For example, take points:
Lower line below the diagonal line
point on x-axis: x1,y1 = 10,0
point on y-axis: x2,y2 = 99,90

here 20,0,99,80 are bin number, their values are obtained from bins_dict
x1,y1 = bins_dict[10], bins_dict[0]
x2,y2 = bins_dict[99], bins_dict[90]


Equation of straight line:
y-y1 = y2-y1 * (x-x1)
       -----
       x2-x1

boundary: (x2-x1) * (y-y1) - (y2-y1) * (x-x1)
In [48]:
N = 100
bins = np.linspace(0, max_abs_xcol_ycol,N+1)
bins_dict = {i:v for i,v in enumerate(bins)}

# points from contour plot
x1y1x2y2 = [10,0,99,90]
print(x1y1x2y2)

# actual values of x1, y1, x2, and y2
x1,y1 = bins_dict[x1y1x2y2[0]], bins_dict[x1y1x2y2[1]]
x2,y2 = bins_dict[x1y1x2y2[2]], bins_dict[x1y1x2y2[3]]

df['above'] = df.eval(" ( (@x2-@x1) * gmd_sq )  >= ( (@y2-@y1) * (g_sq - @x1 )) ")

print(df['above'].value_counts())
print()
print(df['above'].value_counts(normalize=True))
[10, 0, 99, 90]
True     23205
False    12256
Name: above, dtype: int64

True     0.654381
False    0.345619
Name: above, dtype: float64
In [49]:
df[df.above==True]['gm_sq'].plot.hist(figsize=(12,8),bins=60,color='b',label='Above')

nabove = len(df[df.above==True])
nbelow = len(df[df.above==False])

above_pct = nabove/(nabove+nbelow)*100
below_pct = 100-above_pct
text = f'Above = {nabove:,} ({above_pct:,.0f}%)\nBelow = {nbelow:,} ( {below_pct:,.0f}%)'

plt.text(0.5,10_000,text,fontsize=14)
plt.legend()

plt.xlabel('gm_sq')
plt.title('gm_sq histograms for above and below cases', fontsize=20)

df[df.above==False]['gm_sq'].plot.hist(figsize=(12,8),bins=60,color='r',alpha=0.5,label='Below')
plt.legend()
plt.xlabel('gm_sq')
plt.title('gm_sq histograms below the boundary', fontsize=20)
Out[49]:
Text(0.5, 1.0, 'gm_sq histograms below the boundary')
In [50]:
df[df.above==True]['gm_sq'].plot.hist(figsize=(12,8),bins=60,color='r',alpha=0.5)
plt.xlabel('gm_sq')
plt.title('gm_sq histogram', fontsize=20)
Out[50]:
Text(0.5, 1.0, 'gm_sq histogram')
In [51]:
df[df.above==False]['gm_sq'].plot.hist(figsize=(12,8),bins=60,color='r',alpha=0.5,label='Below')
plt.legend()
plt.xlabel('gm_sq')
plt.title('gm_sq histograms below the boundary', fontsize=20)
Out[51]:
Text(0.5, 1.0, 'gm_sq histograms below the boundary')

Now, Take only the data above the lower diagonal as cleaned data

In [52]:
rows_before = df.shape[0]
df = df[df.above==True]

print(f'Before rows: {rows_before}, After rows: {df.shape[0]}')
Before rows: 35461, After rows: 23205
In [53]:
plt.figure(figsize=(12,8))
sns.distplot(df['gm_sq'],label='gm_sq')
plt.legend()
plt.savefig('images/gmsq_histogram_after_cleaning.png',dpi=300)

gm vs gc Plots

Equation of straight line:
y-y1 = y2-y1 * (x-x1)
       -----
       x2-x1

boundary: (x2-x1) * (y-y1) - (y2-y1) * (x-x1)
In [54]:
fig,ax = plt.subplots(1,2,figsize=(16,8))
df.plot.scatter(x='gm[0]',y='gc[0]', ax=ax[0])
df.plot.scatter(x='gm[1]',y='gc[1]', ax=ax[1])

ax[0].set_xlim(-2.0,2.0)
ax[0].set_ylim(-2.0,2.0)

ax[1].set_xlim(-2.0,2.0)
ax[1].set_ylim(-2.0,2.0)

# 45 degree line
n=2.0
ax[0].plot([-n,n],[-n,n],'r--')
ax[1].plot([-n,n],[-n,n],'r--')

plt.suptitle('gm vs gc plot',weight='bold',fontsize=24);

Remove outliers based on gm vs gc plot

In [55]:
# Two scatter plots
#===========================================
fig,ax = plt.subplots(1,2,figsize=(16,8))
plt.suptitle('gm vs gc plot',weight='bold',fontsize=24);

df.plot.scatter(x='gm[0]',y='gc[0]', ax=ax[0])
df.plot.scatter(x='gm[1]',y='gc[1]', ax=ax[1])

ax[0].set_xlim(-2.0,2.0)
ax[0].set_ylim(-2.0,2.0)

ax[1].set_xlim(-2.0,2.0)
ax[1].set_ylim(-2.0,2.0)

# 45 degree line
n=2.0
ax[0].plot([-n,n],[-n,n],'r--')
ax[1].plot([-n,n],[-n,n],'r--')



# Left plot gm0 vs gc0 parallel lines
#===========================================
x = df['gm[0]'].to_numpy()
y = df['gc[0]'].to_numpy()

# Find distance for parallel lines
sigma = (x - y).std()
n = 10
d = n * sigma
c = d/np.sqrt(2) 
y1 = x + c
y2 = x - c

# left plot parallel lines
ax[0].plot(y1,x,'b-.',label=f'{n} sigma')
ax[0].plot(y2,x,'b-.')


# Right plot gm1 vs gc1 parallel lines
#===========================================
x = df['gm[1]'].to_numpy()
y = df['gc[1]'].to_numpy()

sigma = (x - y).std()
n = 10
d = n * sigma
c = d/np.sqrt(2) 
y1 = x + c
y2 = x - c

ax[1].plot(y1,x,'b-.', label=f'{n} sigma')
ax[1].plot(y2,x,'b-.')


#=============
# Add legends
# plt.tight_layout()
ax[0].legend(loc='upper left')
ax[1].legend(loc='upper left')

plt.show()
In [56]:
# Remove outliers for gc0 vs gm0
n = 10
sigma = (df['gm[0]'] - df['gc[0]']).std()
d = n * sigma
c = d/np.sqrt(2)

cond_upper = (df['gc[0]'] - df['gm[0]'] <= c)
cond_below = (df['gm[0]'] - df['gc[0]'] <= c)

df = df[ cond_upper & cond_below ]


# df.plot.scatter(x='gm[0]',y='gc[0]')
In [57]:
# Remove outliers for gc1 vs gm1
n = 10
sigma = (df['gm[1]'] - df['gc[1]']).std()
d = n * sigma
c = d/np.sqrt(2)

cond_upper = (df['gc[1]'] - df['gm[1]'] <= c)
cond_below = (df['gm[1]'] - df['gc[1]'] <= c)

df = df[ cond_upper & cond_below ]


# df.plot.scatter(x='gm[1]',y='gc[1]')
In [58]:
# Two scatter plots
#===========================================
fig,ax = plt.subplots(1,2,figsize=(16,8))
plt.suptitle('gm vs gc plot',weight='bold',fontsize=24);

df.plot.scatter(x='gm[0]',y='gc[0]', ax=ax[0])
df.plot.scatter(x='gm[1]',y='gc[1]', ax=ax[1])

ax[0].set_xlim(-2.0,2.0)
ax[0].set_ylim(-2.0,2.0)

ax[1].set_xlim(-2.0,2.0)
ax[1].set_ylim(-2.0,2.0)

# 45 degree line
n=2.0
ax[0].plot([-n,n],[-n,n],'r--')
ax[1].plot([-n,n],[-n,n],'r--')



# Left plot gm0 vs gc0 parallel lines
#===========================================
x = df['gm[0]'].to_numpy()
y = df['gc[0]'].to_numpy()

# Find distance for parallel lines
sigma = (x - y).std()
n = 10
d = n * sigma
c = d/np.sqrt(2) 
y1 = x + c
y2 = x - c

# left plot parallel lines
ax[0].plot(y1,x,'b-.',label=f'{n} sigma')
ax[0].plot(y2,x,'b-.')




# Right plot gm1 vs gc1 parallel lines
#===========================================
x = df['gm[1]'].to_numpy()
y = df['gc[1]'].to_numpy()

sigma = (x - y).std()
n = 10
d = n * sigma
c = d/np.sqrt(2) 
y1 = x + c
y2 = x - c

ax[1].plot(y1,x,'b-.', label=f'{n} sigma')
ax[1].plot(y2,x,'b-.')


#=============
# Add legends
# plt.tight_layout()
ax[0].legend(loc='upper left')
ax[1].legend(loc='upper left')

plt.show()

Plot magnidude for different bin numbers

In [59]:
df.filter(regex='mag').head(2)
Out[59]:
mag[0][0] mag[1][0] mag[2][0] mag[3][0] mag_mono mag_chro
0 -11.432859 -11.439836 -11.430606 -11.437581 -11.436348 -11.434093
1 -10.544291 -10.541927 -10.541729 -10.539594 -10.543109 -10.540662
In [60]:
df[['gm_sq','gc_sq']].describe()
Out[60]:
gm_sq gc_sq
count 2.309700e+04 2.309700e+04
mean 3.610357e-02 3.447451e-02
std 9.465974e-02 9.356339e-02
min 5.000000e-08 1.450000e-07
25% 1.262953e-03 1.222600e-03
50% 3.922842e-03 3.719472e-03
75% 1.780393e-02 1.650081e-02
max 1.142832e+00 1.124436e+00
In [ ]:
def plot_bin_mag_mono_chro(nbins,show=False,ret=False):
    import os
    
    if not os.path.isdir('images'):
        os.makedirs('images')
    
    df['bins_mag_mono'] = pd.cut(df['mag_mono'],nbins)
    df['bins_mag_chro'] = pd.cut(df['mag_chro'],nbins)
    
    # show the text of bin counts
    text_mono = df.groupby('bins_mag_mono')['gm_sq'].agg(['count', 'mean'])
    text_mono = text_mono.reset_index().assign(pct_change = lambda x: x['mean'].pct_change())
    text_mono = text_mono.to_string().replace('mean','gm_sq')
    
    text_chro = df.groupby('bins_mag_chro')['gc_sq'].agg(['count', 'mean'])
    text_chro = text_chro.reset_index().assign(pct_change = lambda x: x['mean'].pct_change())
    text_chro = text_chro.to_string().replace('mean','gc_sq')
 
    # data to plot
    df_mono_gm_sq_mean = df.groupby('bins_mag_mono')['gm_sq'].mean()
    df_chro_gm_sq_mean = df.groupby('bins_mag_chro')['gc_sq'].mean()
    
    # plot
    fig,ax = plt.subplots(1,2,figsize=(12,8))
    
    # mono (plot the magnitude mean in each bins)
    df_mono_gm_sq_mean.plot(marker='o',ax=ax[0])
    ax[0].tick_params(axis='x', rotation=90)
    ax[0].set_ylabel('gm_sq_mean',fontsize=18)
    ax[0].set_xlabel('bin_mag_mono',fontsize=18)
    ax[0].set_title(f'gm_sq per magnitude bins with nbins = {nbins}')
    ax[0].text(0,0.5,text_mono,fontsize=14,va='center')
    ax[0].set_ylim(0,1)
    ax[0].set_yticks(np.arange(0, 1, step=0.1))
 
    # chro
    df_chro_gm_sq_mean.plot(marker='o',ax=ax[1])
    ax[1].tick_params(axis='x', rotation=90)
    ax[1].set_ylabel('gc_sq_mean',fontsize=18)
    ax[1].set_xlabel('bin_mag_chro',fontsize=18)
    ax[1].set_title(f'gc_sq per magnitude bins with nbins = {nbins}')
    ax[1].text(0,0.5,text_chro,fontsize=14,va='center')
    ax[1].set_ylim(0,1)
    ax[1].set_yticks(np.arange(0, 1, step=0.1))
    
    plt.tight_layout()
    plt.savefig(f'images/bin_mag_mono_chro_{nbins}.png')
    
    if show:
        plt.show()
    plt.close()
    
    if ret:
        return df_mono_gm_sq_mean, df_chro_gm_sq_mean

for nbins in range(5,15):
    plot_bin_mag_mono_chro(nbins,show=True)
In [ ]:
"""
Note:
These plots were much different previously without doing some filterings:

https://nbviewer.jupyter.org/github/bpRsh/2019_shear_analysis_after_dmstack/blob/master/Jan_2020/a01_jan8/a01_cleancat15_gc0_gm0.ipynb

""";

Magnitude weight column for Monochromatic case

In [ ]:
df['mag_mono'].plot.hist(bins=100)
In [ ]:
nbins = 9 # the bin number looks good in above gm_sq plots
df_mono, df_chro = plot_bin_mag_mono_chro(nbins,show=True,ret=True)
In [ ]:
# from plot above I can see from Kth point graph goes up.
# graph is flat upto Kth point then increases linearly to top of curve.

num_start_increasing = 3 
df_mono.index[num_start_increasing]
In [ ]:
# how to choose the bottom of the slope mathematically?
# try to see pct change

df_mono.pct_change().to_frame('gm_sq_pct_change').style.background_gradient(low=0,high=0.9)
In [ ]:
# look at smaller part than
df_mono_left = df_mono.pct_change()\
    .loc[df_mono.pct_change().index < df_mono.pct_change().idxmax()]

df_mono_left
In [ ]:
df_mono.idxmax().left, df_mono.idxmax().right
In [ ]:
# look at case when nbinsK = 9 and when the curve is going up
# index for when curve goes linearly up after initial flat
idx_bottom_of_slope_mono = [df_mono.index[num_start_increasing].left,
                       df_mono.index[num_start_increasing].right
                       ]

mag_low_nbinsK_mono = np.mean(idx_bottom_of_slope_mono)

# index for peak of curve
idx_peak_mono = [df_mono.idxmax().left,
            df_mono.idxmax().right
           ]
mag_high_nbinsK_mono = np.mean(idx_peak_mono)

idx_bottom_of_slope_mono, idx_peak_mono, mag_low_nbinsK_mono, mag_high_nbinsK_mono
In [ ]:
from scipy.optimize import curve_fit

xcol = 'mag_mono'
ycol = 'gm_sq'

x = df.query("""  @mag_low_nbinsK_mono < mag_mono <  @mag_high_nbinsK_mono  """)[xcol].to_numpy()
y = df.query("""  @mag_low_nbinsK_mono < mag_mono <  @mag_high_nbinsK_mono  """)[ycol].to_numpy()

def func(x, a, b):
    return a*x + b

params, _ = curve_fit(func, x, y)
[a, b] = params.round(2)

print(f'magnitude ranges for mono: {mag_low_nbinsK_mono}, {mag_high_nbinsK_mono}')
print(f'fitting params   for mono: {a}, {b}' )
In [ ]:
mag_low_nbinsK_mono
In [ ]:
def magnitude_weight_mono(mag):
    if mag < mag_low_nbinsK_mono:
        return 1/ (mag_low_nbinsK_mono *a + b)
    
    else:
        return 1/ (a*mag + b)

df['wt_mag_mono'] = df['mag_mono'].apply(magnitude_weight_mono)
df['wt_mag_mono'] = df['wt_mag_mono'] / df['wt_mag_mono'].mean() # normalize by mean

Magnitude weight column for Chromatic case

In [ ]:
df['mag_chro'].plot.hist(bins=100)
In [ ]:
nbins = 9 # the bin number looks good in above gm_sq plots
df_mono, df_chro = plot_bin_mag_mono_chro(nbins,show=True,ret=True)
In [ ]:
# from plot above I can see from Kth point graph goes up.
# graph is flat upto Kth point then increases linearly to top of curve.

num_start_increasing = 3 
df_chro.index[num_start_increasing]
In [ ]:
# look at case when nbinsK = 9 and when the curve is going up
# index for when curve goes linearly up after initial flat
idx_bottom_of_slope_chro = [df_chro.index[num_start_increasing].left,
                            df_chro.index[num_start_increasing].right
                            ]

mag_low_nbinsK_chro = np.mean(idx_bottom_of_slope_chro)

# index for peak of curve
idx_peak_chro = [df_chro.idxmax().left,
                 df_chro.idxmax().right
                 ]
mag_high_nbinsK_chro = np.mean(idx_peak_chro)

idx_bottom_of_slope_chro, idx_peak_chro, mag_low_nbinsK_chro, mag_high_nbinsK_chro
In [ ]:
from scipy.optimize import curve_fit


xcol = 'mag_chro'
ycol = 'gc_sq'

x = df.query("""  @mag_low_nbinsK_chro < mag_chro <  @mag_high_nbinsK_chro  """)[xcol].to_numpy()
y = df.query("""  @mag_low_nbinsK_chro < mag_chro <  @mag_high_nbinsK_chro  """)[ycol].to_numpy()

def func(x, a, b):
    return a*x + b

params, _ = curve_fit(func, x, y)
[a, b] = params.round(2)

print(f'magnitude ranges for chro: {mag_low_nbinsK_chro}, {mag_high_nbinsK_chro}')
print(f'fitting params   for chro: {a}, {b}' )
In [ ]:
mag_low_nbinsK_chro
In [ ]:
def magnitude_weight_chro(mag):
    if mag < mag_low_nbinsK_chro:
        return 1/ (mag_low_nbinsK_chro*a + b)
    
    else:
        return 1/ (a*mag + b)

df['wt_mag_chro'] = df['mag_chro'].apply(magnitude_weight_chro)
df['wt_mag_chro'] = df['wt_mag_chro'] / df['wt_mag_chro'].mean() # normalize by mean

# mean
df['wt_mag']      = (df['wt_mag_mono'] + df['wt_mag_chro']) / 2

# df.drop(['wt_mag_chro','wt_mag_mono'],axis=1,inplace=True)

df.iloc[:2,-7:]

Ellipticity Components Transformation

c2 = (dx * dx - dy * dy) / (r * r);
s2 = 2 * dx * dy / (r * r);
eX = s2 * e[0] + c2 * e[1];
eesum += eX * eX * w[0] * w[0];
eTsum[bin] -= (c2 * e[0] + s2 * e[1]) * w[0];
In [ ]:
df.head(2)
In [ ]:
# constants
RMIN = 10
DLNR = 0.5

df['dx'] = df['x[0]'] - 1699 # jesisim output fitsfiles have shape 3398, 3398
df['dy'] = df['x[1]'] - 1699

df['r'] = np.hypot(df['dx'], df['dy'])
# df['r'] = np.sqrt(df['dx']**2 + df['dy']**2)

df['cos2theta'] = df.eval(' (dx * dx - dy * dy) / (r * r)' )
df['sin2theta'] = df.eval(' (2  * dx * dy     ) / (r * r)' )

df['bin'] = ( np.log(df.r / RMIN) / DLNR).astype(int)

df['bin'].value_counts()
In [ ]:
df['eX_mono'] =       df['sin2theta'] * df['gm[0]'] + df['cos2theta'] * df['gm[1]']
df['eT_mono'] = -1 * (df['cos2theta'] * df['gm[0]'] + df['sin2theta'] * df['gm[1]'] ) 

df['eX_chro'] =       df['sin2theta'] * df['gc[0]'] + df['cos2theta'] * df['gc[1]']
df['eT_chro'] = -1 * (df['cos2theta'] * df['gc[0]'] + df['sin2theta'] * df['gc[1]']  )
In [ ]:
df['eT_mono_times_wt'] = df['eT_mono'] * df['wt_mag']
df['eT_chro_times_wt'] = df['eT_chro'] * df['wt_mag']
In [ ]:
df['eX_monosq_times_wt'] = df['eX_mono'] *  df['eX_mono'] * df['wt_mag']
df['eX_chrosq_times_wt'] = df['eX_chro'] *  df['eX_chro'] * df['wt_mag']

df['eXsq_times_wt'] = df.eval(" (eX_monosq_times_wt + eX_chrosq_times_wt) / 2 ")

df.iloc[:2,-6:]

Error Analysis

https://www.phenix.bnl.gov/WWW/publish/elke/EIC/Files-for-Wiki/lara.02-008.errors.pdf

When the statistics involved in calculating $E_A$ and $E_B$ are not indendent, the error for a function $f(E_A, E_B)$ has the expression: $$ \sigma_{f}=\sqrt{\left(\frac{\partial f}{\partial E_{A}} \sigma_{A}\right)^{2}+\left(\frac{\partial f}{\partial E_{B}} \sigma_{B}\right)^{2}+2 \frac{\partial f}{\partial E_{A}} \frac{\partial f}{\partial E_{B}} \operatorname{cov}\left(E_{A}, E_{B}\right)} $$

where the last term takes care of the correlations between $E_A$ and $E_B$. Given a large number $N$ of measurements $E_{A_i}$, the standard deviation $\sigma_A$ is empirically defined as:

$$ \sigma_{A}^{2}=\frac{1}{N-1} \sum_{i=1}^{N}\left(E_{A_{i}}-E_{A}\right)^{2} $$

while the covariance between $E_A$ and $E_B$ is given by: $$ \operatorname{cov}\left(E_{A}, E_{B}\right)=\frac{1}{N-1} \sum_{i=1}^{N}\left(E_{A_{i}}-E_{A}\right)\left(E_{B_{i}}-E_{B}\right) $$

where $E_A$ and $E_B$ are the averages of $E_{A_i}$ and $E_{B_i}$. When $E_A$ and $E_B$ are independent, over a large number $N$ of measurements they will fluctuate around their average in an uncorrelated way, so that the covariance is zero and one recovers the usual formula for the propagation of errors in a function of independent variables.

In [ ]:
"""
f = m/c
df/dm = 1/c
df/dc = -m/c2

s2 ==> sigma
s2 = r2 (sm2/m2 + sc2/c2 -2/m/c cov(m,c))

put m=c,
s2 = 2/m2 (sm2 - cov)

error = sigma/sqrt(n)

error = 
            0.000332
      sqrt  --------
             eT(bin)**2
    ---------------------
     sqrt(ngals_bin)
""";
In [ ]:
cov = df[['eT_chro','eT_mono']].cov()
cov
In [ ]:
diag_mean = np.diag(cov).mean()
diag_mean
In [ ]:
offdiag = cov.iloc[0,1]
offdiag
In [ ]:
diag_mean - offdiag
In [ ]:
myerr = (diag_mean - offdiag) * 2
myerr
In [ ]:
# temporary value to calculate error
# err_coeff = 0.000332

err_coeff = myerr

df['err_numerator'] = myerr  # np.sqrt(err_coeff/df['eT_mono']/df['eT_mono'])
df['eX_monosq_times_wt_std'] = df['eX_monosq_times_wt'].std()
df['eX_chrosq_times_wt_std'] = df['eX_chrosq_times_wt'].std()
df['eT_ratio'] = df['eT_mono'] / df['eT_chro']
df['eT_ratio_std'] = df['eT_ratio'].std()
df.iloc[:2,-5:]

Radial Binnings

In [ ]:
# constants
RMIN = 10
DLNR = 0.5
In [ ]:
# renaming aggegration needs pandas > 0.25
pd.__version__
In [ ]:
df.filter(regex='times|err_numerator').head().round(1)
In [ ]:
df_radial_bins = df.groupby('bin').agg(
    r_mean                 = ('r', 'mean'),
    wt_mag_sum             = ('wt_mag', 'sum'),
    eT_mono_times_wt_sum   = ('eT_mono_times_wt', 'sum'),
    eT_chro_times_wt_sum   = ('eT_chro_times_wt', 'sum'),
    eX_monosq_times_wt_sum = ('eX_monosq_times_wt', 'sum'),
    eX_chrosq_times_wt_sum = ('eX_chrosq_times_wt', 'sum'),
    err_numerator_mean     = ('err_numerator', 'mean')
                                       )


# bin counts
df_radial_bins['bin_count'] = df['bin'].value_counts()

# columns after binning 
df_radial_bins['eT_mean_mono'] = (df_radial_bins['eT_mono_times_wt_sum'] / 
                                  df_radial_bins['wt_mag_sum'])

df_radial_bins['eT_mean_chro'] = (df_radial_bins['eT_chro_times_wt_sum'] /
                                  df_radial_bins['wt_mag_sum'])
               
df_radial_bins['eX_mean_mono'] = np.sqrt(df_radial_bins['eX_monosq_times_wt_sum'] /
                                         df_radial_bins['wt_mag_sum'])
                                                     
df_radial_bins['eX_mean_chro'] = np.sqrt(df_radial_bins['eX_chrosq_times_wt_sum'] /
                                         df_radial_bins['wt_mag_sum'])
               
df_radial_bins['eT_ratio']     = (df_radial_bins['eT_mean_mono'] /
                                  df_radial_bins['eT_mean_chro'])


# Errors
sqrt_bin = np.sqrt(df_radial_bins['bin_count'])
df_radial_bins['eT_mono_err'] = df_radial_bins['eX_mean_mono'] / sqrt_bin
df_radial_bins['eT_chro_err'] = df_radial_bins['eX_mean_chro'] / sqrt_bin
df_radial_bins['eT_ratio_err'] = ( np.sqrt(df_radial_bins['err_numerator_mean']/
                                  df_radial_bins['eT_mean_mono'] /
                                  df_radial_bins['eT_mean_chro']) /
                                   sqrt_bin)

print('Statistics for different radial bins')
print(f'RMIN = {RMIN} and DLNR = {DLNR}')

df_radial_bins.style\
.background_gradient(subset=['eT_ratio_err'],cmap='Blues')
In [ ]:
# why some eT values are -ve?
"""
1. For given rmin and dlnr we have some bins very few object counts.

""";
In [ ]:
pd.cut(df['eT_mono'],20).value_counts().to_frame().style.background_gradient()
In [ ]:
fig,ax = plt.subplots(figsize=(12,8))
sns.distplot(df['eT_mono'],ax=ax)
plt.title('Histogram and density plot of eT_mono');
In [ ]:
df['wt_mag'].hist(bins=100,figsize=(12,8));
plt.title('Histogram of wt_mag')
plt.xlabel('wt_mag')
plt.ylabel('Frequency');
plt.savefig('images/weights.png',dpi=300)
In [ ]:
df.query(""" r>500 """)['eT_mono'].describe().to_frame()
In [ ]:
df['gm_sq'].hist(bins=100,figsize=(12,8))
plt.title('Histogram of gm_sq')
plt.ylabel('Frequency')
plt.xlabel('gm_sq')
plt.savefig('images/gm_sq_hist_after_cleaning.png',dpi=300)
In [ ]:
df.head(2)

Plot of eT for mono and chro

In [ ]:
def plot_eT_mean(rmin, dlnr,min_bin_count,
                 show_ratio=True,
                 show_mono_chro=False,
                 show_data=True):
    # title
    title = f'rmin={rmin}, dlnr={dlnr}, min_bin_count={min_bin_count}'
    
    # radial bin
    df['bin'] = ( np.log(df.r / rmin) / dlnr).astype(int)

    # aggregations per given bin
    df_radial_bins = df.groupby('bin').agg(
        r_mean                 = ('r', 'mean'),
        wt_mag_sum             = ('wt_mag', 'sum'),
        eT_mono_times_wt_sum   = ('eT_mono_times_wt', 'sum'),
        eT_chro_times_wt_sum   = ('eT_chro_times_wt', 'sum'),
        eX_monosq_times_wt_sum = ('eX_monosq_times_wt', 'sum'),
        eX_chrosq_times_wt_sum = ('eX_chrosq_times_wt', 'sum'),
        err_numerator_mean     = ('err_numerator', 'mean')
                                           )
    
    # bin counts
    df_radial_bins['bin_count'] = df['bin'].value_counts()
    df_radial_bins = df_radial_bins.query(""" bin_count > @min_bin_count """) 
    df_radial_bins = df_radial_bins.iloc[:-1,:]

    # columns after binning 
    df_radial_bins['eT_mean_mono'] = (df_radial_bins['eT_mono_times_wt_sum'] / 
                                      df_radial_bins['wt_mag_sum'])

    df_radial_bins['eT_mean_chro'] = (df_radial_bins['eT_chro_times_wt_sum'] /
                                      df_radial_bins['wt_mag_sum'])

    df_radial_bins['eX_mean_mono'] = np.sqrt(
        df_radial_bins['eX_monosq_times_wt_sum'] /
        df_radial_bins['wt_mag_sum']
    )

    df_radial_bins['eX_mean_chro'] = np.sqrt(
        df_radial_bins['eX_chrosq_times_wt_sum'] /
        df_radial_bins['wt_mag_sum']
    )

    df_radial_bins['eT_ratio']     = (df_radial_bins['eT_mean_mono'] /
                                      df_radial_bins['eT_mean_chro'])

    
    # Errors
    sqrt_bin = np.sqrt(df_radial_bins['bin_count'])
    df_radial_bins['eT_mono_err'] = df_radial_bins['eX_mean_mono'] / sqrt_bin
    df_radial_bins['eT_chro_err'] = df_radial_bins['eX_mean_chro'] / sqrt_bin
    df_radial_bins['eT_ratio_err'] = ( np.sqrt(df_radial_bins['err_numerator_mean']/
                                  df_radial_bins['eT_mean_mono'] /
                                  df_radial_bins['eT_mean_chro']) /
                                   sqrt_bin)
 
    # also plot eT mono and eT chro
    if show_ratio:
        fig, ax = plt.subplots(1,1,figsize=(12,8))
        df_radial_bins.plot.line(x='r_mean',y='eT_ratio',
                                 yerr='eT_ratio_err',
                                 marker='o',color='b',ax=ax)
        ax.set_xlabel('Radius',fontsize=18)
        ax.set_ylabel(r'$\frac{eT_{mono}}{eT_{chro}}$',fontsize=18)
        plt.ylim(0.95, 1.0)
        
    if show_mono_chro:
        fig, ax = plt.subplots(3,1,figsize=(14,12))
        
        # top
        df_radial_bins.plot.line(x='r_mean',y='eT_mean_mono',
                                 yerr='eT_mono_err',
                                 marker='o',color='r', ax=ax[0])
        
        # top
        df_radial_bins.plot.line(x='r_mean',y='eT_mean_chro',
                                 yerr='eT_chro_err',
                                 marker='o',color='b',ax=ax[0])
        # middle
        df_radial_bins.plot.line(x='r_mean',y='eT_mean_mono',
                                 yerr='eT_mono_err',
                                 marker='o',color='r', ax=ax[1])
        
        # bottom
        df_radial_bins.plot.line(x='r_mean',y='eT_mean_chro',
                                 yerr='eT_chro_err',
                                 marker='o',color='b',ax=ax[2])
        
        # label
        ax[0].set_xlabel('')
        ax[1].set_xlabel('')
        ax[2].set_xlabel('Radius',fontsize=18)
        ax[0].set_ylabel('eT',fontsize=18)
        ax[1].set_ylabel('eT_mean_mono',fontsize=18)
        ax[2].set_ylabel('eT_mean_chro',fontsize=18)
        ax[0].set_xlim(0,1800)
        ax[1].set_xlim(0,1800)
        ax[2].set_xlim(0,1800)

    plt.suptitle(f'Plot of eT for {title}',fontsize=24,weight='bold')
    
    # also show the dataframe
    if show_data:
        display(df_radial_bins.style.background_gradient(subset=['eT_mean_mono']))
    
    
    plt.xlim(0,1800)
    plt.savefig('images/eT_versus_radius.png',dpi=300)
    plt.tight_layout()
    plt.show()

# plot now
plot_eT_mean(rmin=300, dlnr=0.5,min_bin_count=10,
            show_ratio=True,show_mono_chro=False)
In [ ]:
plot_eT_mean(rmin=300, dlnr=0.5,min_bin_count=10,
            show_ratio=False,show_mono_chro=True)

Interactive plots

In [ ]:
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
In [ ]:
df.filter(regex='wt').head(2)
In [ ]:
# I need pandas > 0.25 for multiple agg
pd.__version__
In [ ]:
def plotly_eT_plot(rmin=400, dlnr=0.4, min_bin_count=20,
                   show_mono_chro=False,
                   show_mono=False,
                   show_chro=False,
                   show_data=True,
                  ):
    
    # title
    title=f'rmin={rmin}, dlnr={dlnr}, min_bin_count={min_bin_count}'
    
    # radial bin
    df['bin'] = ( np.log(df.r / rmin) / dlnr).astype(int)

    # aggregations per given bin
    df_radial_bins = df.groupby('bin').agg(
        r_mean                 = ('r', 'mean'),
        wt_mag_sum             = ('wt_mag', 'sum'),
        eT_mono_times_wt_sum   = ('eT_mono_times_wt', 'sum'),
        eT_chro_times_wt_sum   = ('eT_chro_times_wt', 'sum'),
        eX_monosq_times_wt_sum = ('eX_monosq_times_wt', 'sum'),
        eX_chrosq_times_wt_sum = ('eX_chrosq_times_wt', 'sum'),
        err_numerator_mean     = ('err_numerator', 'mean')
                                           )
    

    # bin counts
    df_radial_bins['bin_count'] = df['bin'].value_counts()
    df_radial_bins = df_radial_bins.query(""" bin_count > @min_bin_count """) 
    df_radial_bins = df_radial_bins.iloc[:-1,:]
        
    # columns after binning 
    df_radial_bins['eT_mean_mono'] = (df_radial_bins['eT_mono_times_wt_sum'] / 
                                      df_radial_bins['wt_mag_sum'])

    df_radial_bins['eT_mean_chro'] = (df_radial_bins['eT_chro_times_wt_sum'] /
                                      df_radial_bins['wt_mag_sum'])

    df_radial_bins['eX_mean_mono'] = np.sqrt(
        df_radial_bins['eX_monosq_times_wt_sum'] /
        df_radial_bins['wt_mag_sum']
    )

    df_radial_bins['eX_mean_chro'] = np.sqrt(
        df_radial_bins['eX_chrosq_times_wt_sum'] /
        df_radial_bins['wt_mag_sum']
    )

    df_radial_bins['eT_ratio']     = (df_radial_bins['eT_mean_mono'] /
                                      df_radial_bins['eT_mean_chro'])

    
    # Errors
    sqrt_bin = np.sqrt(df_radial_bins['bin_count'])
    df_radial_bins['eT_mono_err'] = df_radial_bins['eX_mean_mono'] / sqrt_bin
    df_radial_bins['eT_chro_err'] = df_radial_bins['eX_mean_chro'] / sqrt_bin
    df_radial_bins['eT_ratio_err'] = ( np.sqrt(df_radial_bins['err_numerator_mean']/
                                  df_radial_bins['eT_mean_mono'] /
                                  df_radial_bins['eT_mean_chro']) /
                                   sqrt_bin)
    
    # remove first bin of strong lensing
    df_radial_bins = df_radial_bins.iloc[1:,:]
    
    # eT mono vs chro ratio
    sc = go.Scatter( x=df_radial_bins['r_mean'],
                     y=df_radial_bins['eT_ratio'],
                     mode = 'lines+markers',
                     name = 'eT ratio',
                     error_y = dict(
                         type='data',
                         array=df_radial_bins['eT_ratio_err'],
                         visible=True,
                         )
                   )
    
    mydata = [sc]
    
    # layout
    layout = go.Layout(
                    title={
                        'text': title,
                        'y':0.9,
                        'x':0.5,
                        'xanchor': 'center',
                        'yanchor': 'top'},
                    xaxis=dict(
                               title='radius',
                               titlefont=dict(
                               family='Courier New, monospace',
                               size=18,
                               color='#7f7f7f')),
                     yaxis=dict(title='eT',
                                titlefont=dict(
                                          family='Courier New, monospace',
                                          size=18,
                                          color='#7f7f7f')))
    
    myfig = go.Figure(data=mydata, layout=layout)

    
    # also plot eT mono and chro
    # monochromatic
    sc1 = go.Scatter(x=df_radial_bins['r_mean'],
                     y=df_radial_bins['eT_mean_mono'],
                     mode = 'lines+markers',
                     name = 'eT mono',
                     error_y = dict(
                         type='data',
                         array=df_radial_bins['eT_mono_err'],
                         visible=True,
                         )
                    )
        # chromatic
    sc2 = go.Scatter(x=df_radial_bins['r_mean'],
                     y=df_radial_bins['eT_mean_chro'],
                     mode = 'lines+markers',
                     name = 'eT chro',
                     error_y = dict(
                         type='data',
                         array=df_radial_bins['eT_chro_err'],
                         visible=True,
                         )
                    )
    if show_mono_chro:
        mydata= [sc1,sc2]
        myfig = go.Figure(data=mydata, layout=layout)
        
    if show_mono:
        mydata = [sc1]
        myfig = go.Figure(data=mydata, layout=layout)
        myfig.update_layout(
                    xaxis_title="Radius",
                    yaxis_title="eT_mono",
                )
        myfig.update_layout(
                    title={
                        'text': "eT_mono vs radius plot",
                        'y':0.9,
                        'x':0.5,
                        'xanchor': 'center',
                        'yanchor': 'top'}
        )
        
    if show_chro:
        mydata = [sc2]
        myfig = go.Figure(data=mydata, layout=layout)
        myfig.update_layout(
                    xaxis_title="Radius",
                    yaxis_title="eT_chro",
                )
        myfig.update_layout(
                    title={
                        'text': "eT_chro vs radius plot",
                        'y':0.9,
                        'x':0.5,
                        'xanchor': 'center',
                        'yanchor': 'top'})
    
    # this is for all cases
    myfig.update_layout(showlegend=True)

    
    # also show the dataframe
    if show_data:
        display(df_radial_bins.style.background_gradient(subset=['eT_mean_mono']))

    # iplot plots in jupyter-notebook, plot opens in new tab.
    return iplot(myfig, filename='myplot.html')

# plot
plotly_eT_plot(rmin=400, dlnr=0.4, min_bin_count=20,
                   show_mono_chro=False,
                   show_mono=False,
                   show_chro=False,
                   show_data=True,
                  )
In [ ]:
plotly_eT_plot(rmin=300, dlnr=0.5,min_bin_count=20,show_mono_chro=True)
In [ ]:
plotly_eT_plot(rmin=300, dlnr=0.5,min_bin_count=20,show_data=False,show_mono=True)
In [ ]:
plotly_eT_plot(rmin=300, dlnr=0.5,min_bin_count=20,show_data=False,show_chro=True)

ipywidgets

which pip
pip install ipywidgets
jupyter nbextension enable --py --sys-prefix widgetsnbextension
In [ ]:
import ipywidgets as widgets
from ipywidgets import HBox, VBox
from ipywidgets import interactive
In [ ]:
rmin_ = widgets.IntSlider(min=0, max=500, step=50, value=300)
dlnr_  = widgets.FloatSlider(min=0.1,max=0.8,step=0.05,value=0.5)
min_bin_count_ = widgets.IntSlider(min=10, max=600, step=20, value=20)
interactive_plot = interactive(plotly_eT_plot,
                               rmin=rmin_,
                               dlnr=dlnr_,
                               min_bin_count=min_bin_count_)

output = interactive_plot.children[-1]
interactive_plot

Time Taken

In [ ]:
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
      '{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))
In [ ]:
import subprocess
subprocess.call(['python', '-m', 'nbconvert', '*.ipynb'])
In [ ]:
!python get_nbviewer_links.py
In [ ]: